#! /usr/bin/python
import numpy as np
import pandas as pd

# Load the three MovieLens tables.  The '::' separator is multi-character,
# which requires pandas' python parsing engine (the default C engine only
# supports single-character separators).  read_csv replaces the deprecated
# read_table.
users = pd.read_csv('users.dat',
        sep='::', header=None, engine='python',
        names=['user_id', 'gender', 'age', 'occupation', 'zip'])

ratings = pd.read_csv('ratings.dat',
        sep='::', header=None, engine='python',
        names=['user_id', 'movie_id', 'rating', 'timestamp'])

movies = pd.read_csv('movies.dat',
        sep='::', header=None, engine='python',
        names=['movie_id', 'title', 'genres'])

# show how one of them looks
rating_head = ratings.head(5)
print(rating_head)
print(rating_head.movie_id != 661)

# Merge the three tables on their shared key columns (user_id, movie_id).
movielens = pd.merge(pd.merge(ratings, users), movies)
print(movielens.head())

# let's work with a smaller subset for speed reasons
# (.loc replaces the removed .ix indexer)
movielens = movielens.loc[np.random.choice(movielens.index, size=40000, replace=False)]
print(movielens.shape)
print(movielens.user_id.nunique())
print(movielens.movie_id.nunique())

# Keep only users with more than one rating, so that every remaining user
# can later be split into train and test portions.
user_ids_larger_1 = movielens.user_id.value_counts(sort=False) > 1
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index

# Boolean isin mask replaces the removed DataFrame.select(lambda ...).
movielens = movielens[movielens.user_id.isin(user_ids_larger_1)]
print(movielens.shape)
assert np.all(movielens.user_id.value_counts() > 1)

# We now generate train and test subsets by marking 20% of each users's ratings, using groupby and apply.
def assign_to_set(df):
    """Mark ~20% of one user's ratings for testing.

    Mutates and returns *df*: sets 'for_testing' to True on a random 20%
    (rounded up) of its rows.  Intended for groupby('user_id').apply.
    """
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    # .loc is the label-based replacement for the removed .ix indexer.
    df.loc[sampled_ids, 'for_testing'] = True
    return df

# Flag 20% of each user's ratings, then split into disjoint train/test sets.
movielens['for_testing'] = False
grouped = movielens.groupby('user_id', group_keys=False).apply(assign_to_set)
movielens_train = movielens[grouped.for_testing == False]
movielens_test = movielens[grouped.for_testing == True]
print(movielens.shape)
print(movielens_train.shape)
print(movielens_test.shape)
# The deprecated Index & operator is replaced by the explicit intersection.
assert len(movielens_train.index.intersection(movielens_test.index)) == 0


# Store these two sets in text files:
# movielens_train.to_csv('data/my_generated_movielens_train.csv')
# movielens_test.to_csv('data/my_generated_movielens_test.csv')

def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """

    errors = y_pred - y_true
    return np.sqrt((errors ** 2).mean())

def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas.

    estimate_f: callable(user_id, movie_id) -> predicted rating.
    Returns the RMSE of estimate_f over the module-level movielens_test set.
    """
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u, i) for (u, i) in ids_to_estimate])
    real = movielens_test.rating.values
    print(estimated.size)
    print(real.size)
    return compute_rmse(estimated, real)

def estimate1(user_id, movie_id):
    """ Content baseline: predict the user's mean rating over the training
    set, ignoring the movie entirely. """

    user_condition = movielens_train.user_id == user_id
    return movielens_train.loc[user_condition, 'rating'].mean()

print('RMSE for estimate1: %s' % evaluate(estimate1))

def estimate0(user_id, movie_id):
    """ Random baseline: a uniform rating in [0, 5), ignoring both arguments.

    (The previous version also computed an unused boolean mask over the
    training set on every call; that dead work is removed.)
    """
    return np.random.rand() * 5
print('RMSE for estimate0: %s' % evaluate(estimate0))

def estimate2(user_id, movie_id):
    """ Simple collaborative filter: mean rating given to this movie by
    all other users; 3.0 (scale midpoint) when nobody else rated it. """

    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0
    return ratings_by_others.rating.mean()

print('RMSE for estimate2: %s' % evaluate(estimate2))

user_info = users.set_index('user_id')

def estimate3(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u'):
    other users' mean rating of the movie, grouped by age band,
    preferring the band of the target user. """

    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 3.0  # scale-midpoint fallback when nobody else rated it

    # pivot_table's rows=/cols= keywords were removed; index=/columns= are
    # the modern equivalents.  Locals renamed: the original *_gender names
    # actually held age data.
    means_by_age = ratings_by_others.pivot_table('rating', index='movie_id', columns='age')
    user_age = user_info.loc[user_id, 'age']
    if user_age in means_by_age.columns:
        return means_by_age.loc[movie_id, user_age]
    else:
        return means_by_age.loc[movie_id].mean()

print('RMSE for reco3: %s' % evaluate(estimate3))

def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    d1 = s1 - s1.mean()
    d2 = s2 - s2.mean()
    numerator = (d1 * d2).sum()
    denominator = np.sqrt((d1 ** 2).sum() * (d2 ** 2).sum())
    return numerator / denominator

class Reco5:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """

        # movie_id x user_id matrix of ratings (NaN where unrated).
        # index=/columns= replace pivot_table's removed rows=/cols= keywords.
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity.

        Falls back to 3.0 (scale midpoint) when nobody else rated the
        movie, and to the unweighted mean when no other rater correlates
        positively with the target user.
        """

        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        # Chain set_index instead of mutating in place: the .loc selection
        # is a copy of movielens_train, and in-place mutation of it trips
        # pandas' SettingWithCopy warning.
        ratings_by_others = (movielens_train.loc[user_condition & movie_condition]
                             .set_index('user_id'))
        if ratings_by_others.empty:
            return 3.0

        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        # Keep only positively correlated raters: negative similarities
        # would be invalid weights for np.average.
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
# Fit the correlation-weighted recommender and report its test RMSE.
reco = Reco5()
reco.learn()
print('RMSE for reco5: %s' % evaluate(reco.estimate))
